{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 102 - Training Regression Algorithms with the L-BFGS Solver\n",
    "\n",
    "In this example, we run a linear regression on the *Flight Delay* dataset to predict the delay times.\n",
    "\n",
    "We demonstrate how to use the `TrainRegressor` and the `ComputePerInstanceStatistics` APIs.\n",
    "\n",
    "First, import the packages."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import mmlspark"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, import the CSV dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load raw data from small-sized 30 MB CSV file (trimmed to contain just what we use)\n",
    "dataFile = \"On_Time_Performance_2012_9.csv\"\n",
    "import os, urllib\n",
    "if not os.path.isfile(dataFile):\n",
    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n",
    "flightDelay = spark.createDataFrame(\n",
    "    pd.read_csv(dataFile, dtype={\"Month\": np.float64, \"Quarter\": np.float64,\n",
    "                                 \"DayofMonth\": np.float64, \"DayOfWeek\": np.float64,\n",
    "                                 \"OriginAirportID\": np.float64, \"DestAirportID\": np.float64,\n",
    "                                 \"CRSDepTime\": np.float64, \"CRSArrTime\": np.float64}))\n",
    "# Print information on the dataset we loaded\n",
    "print(\"records read: \" + str(flightDelay.count()))\n",
    "print(\"Schema:\")\n",
    "flightDelay.printSchema()\n",
    "flightDelay.limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Split the dataset into train and test sets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train,test = flightDelay.randomSplit([0.75, 0.25])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train a regressor on dataset with `l-bfgs`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from mmlspark import TrainRegressor, TrainedRegressorModel\n",
    "from pyspark.ml.regression import LinearRegression\n",
    "from pyspark.ml.feature import StringIndexer\n",
    "# Convert columns to categorical\n",
    "catCols = [\"Carrier\", \"DepTimeBlk\", \"ArrTimeBlk\"]\n",
    "trainCat = train\n",
    "testCat = test\n",
    "for catCol in catCols:\n",
    "    simodel = StringIndexer(inputCol=catCol, outputCol=catCol + \"Tmp\").fit(train)\n",
    "    trainCat = simodel.transform(trainCat).drop(catCol).withColumnRenamed(catCol + \"Tmp\", catCol)\n",
    "    testCat = simodel.transform(testCat).drop(catCol).withColumnRenamed(catCol + \"Tmp\", catCol)\n",
    "lr = LinearRegression().setSolver(\"l-bfgs\").setRegParam(0.1).setElasticNetParam(0.3)\n",
    "model = TrainRegressor(model=lr, labelCol=\"ArrDelay\").fit(trainCat)\n",
    "model.write().overwrite().save(\"flightDelayModel.mml\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Score the regressor on the test data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "flightDelayModel = TrainedRegressorModel.load(\"flightDelayModel.mml\")\n",
    "scoredData = flightDelayModel.transform(testCat)\n",
    "scoredData.limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Compute model metrics against the entire scored dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from mmlspark import ComputeModelStatistics\n",
    "metrics = ComputeModelStatistics().transform(scoredData)\n",
    "metrics.toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, compute and show per-instance statistics, demonstrating the usage\n",
    "of `ComputePerInstanceStatistics`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from mmlspark import ComputePerInstanceStatistics\n",
    "evalPerInstance = ComputePerInstanceStatistics().transform(scoredData)\n",
    "evalPerInstance.select(\"ArrDelay\", \"Scores\", \"L1_loss\", \"L2_loss\").limit(10).toPandas()"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
